download.py 工具抓取訓練用資料
import os
import sys
import argparse
import pandas as pd
from datetime import datetime, timedelta
def get_sp500_tickers() -> list:
    """
    Fetch the list of S&P 500 ticker symbols from Wikipedia.

    Reads the HTML tables on the "List of S&P 500 companies" page and
    returns the ``Symbol`` column of the first table as a plain list.

    Returns:
        The ticker symbols, or an empty list if anything goes wrong while
        fetching or parsing the page (the error is printed, not raised).
    """
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    try:
        # The first table on the page is the one holding the constituents.
        constituents = pd.read_html(wiki_url)[0]
        return constituents['Symbol'].tolist()
    except Exception as e:
        # Best-effort: report the failure and let the caller see "no tickers".
        print(f"Error fetching S&P 500 tickers: {e}")
        return []
def get_dji30_tickers() -> list:
    """
    Return the list of DJI 30 ticker symbols.

    The symbols are a hard-coded snapshot kept in this function; a new list
    object is built on every call, so callers may mutate the result freely.
    """
    symbols = (
        'AAPL AMGN AXP BA CAT CRM CSCO CVX DIS DOW '
        'GS HD HON IBM INTC JNJ JPM KO MCD MMM '
        'MRK MSFT NKE PG TRV UNH V VZ WBA WMT'
    )
    return symbols.split()
def main():
    """
    Command-line entry point: download OHLCV data for a set of tickers.

    Builds the ticker list from ``--sp500``, ``--dow30``, ``--ticker`` and
    ``--tickers`` (duplicates are dropped, first occurrence wins), then asks
    ``PolygonIODownloader.fetch_ohlcv`` for each ticker in turn and prints a
    short preview (first, middle and last rows) of every non-empty result.

    A failure on one ticker is printed and does not stop the remaining
    downloads. The process exits with status 1 when no ticker was specified.
    """
    # Put the directory two levels above this script's directory on the
    # import path; presumably this is where the PolygonIO package lives.
    sys.path.append(
        os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
    # Set up argparse to handle the command-line arguments.
    parser = argparse.ArgumentParser(
        description="Polygon.io Data Fetching Command Line Tool")
    parser.add_argument('--sp500',
                        action='store_true',
                        help='將 S&P 500 成分股加入 ticker_list 中')
    parser.add_argument('--dow30',
                        action='store_true',
                        help='將 DJI 30 成分股加入 ticker_list 中')
    parser.add_argument('--ticker', type=str, help='將單個 ticker 加入 ticker_list')
    parser.add_argument('--tickers',
                        type=str,
                        nargs='+',
                        help='將多個 tickers 加入 ticker_list')
    parser.add_argument('--timespan',
                        '-t',
                        type=str,
                        default='hour',
                        help='指定抓取的時間跨度(如 minute, hour, day, week)')
    parser.add_argument('--multiplier',
                        '-m',
                        type=int,
                        default=1,
                        help='指定時間跨度的倍數 (如 15 表示每 15 分鐘, 默認為 1)')
    parser.add_argument('--start',
                        '-s',
                        type=str,
                        required=True,
                        help='指定開始日期 (YYYY-MM-DD)')
    parser.add_argument('--end',
                        '-e',
                        type=str,
                        required=True,
                        help='指定結束日期 (YYYY-MM-DD)')
    args = parser.parse_args()
    # Collect the requested tickers from every source that was selected.
    ticker_list = []
    if args.sp500:
        ticker_list += get_sp500_tickers()
    if args.dow30:
        ticker_list += get_dji30_tickers()
    if args.ticker:
        ticker_list.append(args.ticker)
    if args.tickers:
        ticker_list += args.tickers
    # The sources can overlap (e.g. --sp500 together with --dow30), so drop
    # duplicates while keeping first-seen order; otherwise the same ticker
    # would be downloaded more than once.
    ticker_list = list(dict.fromkeys(ticker_list))
    # At least one ticker must have been specified.
    if not ticker_list:
        print(
            "Error: 至少需要指定一個 ticker (通過 --sp500, --dow30, --ticker 或 --tickers)"
        )
        sys.exit(1)
    # Date range, passed through to the downloader as given on the command line.
    start_date_str = args.start
    end_date_str = args.end
    # Imported here, after the sys.path tweak above, so the package resolves.
    from PolygonIO.PolygonIODownloader import PolygonIODownloader
    polygon_wrapper = PolygonIODownloader(root_dir='./datasets')
    # Fetch the data one ticker at a time so a failure only affects that ticker.
    for idx, ticker in enumerate(ticker_list, start=1):
        print(f"Fetching data for {ticker} ({idx}/{len(ticker_list)})...")
        try:
            df_ohlcv = polygon_wrapper.fetch_ohlcv(
                ticker_list=[ticker],
                start_date_str=start_date_str,
                end_date_str=end_date_str,
                timespan=args.timespan,
                multiplier=args.multiplier)
            if df_ohlcv.empty:
                print(f"No data fetched for {ticker}.")
            else:
                print(
                    f"Data for {ticker} fetched successfully. {len(df_ohlcv)} rows."
                )
                # Print the first 10, the middle 10 and the last 10 rows.
                print("前10筆資料:")
                print(df_ohlcv.head(10))
                mid_point = len(df_ohlcv) // 2
                # Clamp at 0: a negative start would make iloc count from the
                # end of the frame and show the wrong rows for short frames.
                mid_start = max(mid_point - 5, 0)
                print("中間10筆資料:")
                print(df_ohlcv.iloc[mid_start:mid_point + 5])
                print("最後10筆資料:")
                print(df_ohlcv.tail(10))
        except Exception as e:
            # Report and move on to the next ticker instead of aborting the run.
            print(f"Error fetching data for {ticker}: {e}")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
如果是免費帳號,則要修改下載程式,在每次list_aggs(...)API Call之間拉出間隔,並且時間只能是2年以內的資料。
class PolygonIOCore:
    # Excerpt only: the "......" lines stand for code omitted from this listing.
    ......
    def fetch(self,
              ticker: str,
              start_date_str: str,
              end_date_str: str,
              timespan: str,
              multiplier: int = 1) -> pd.DataFrame:
        # Collects every aggregate yielded by self.client.list_aggs for one
        # ticker over the requested range into `aggs` (defined in the omitted
        # part of this method).
        ......
        for a in self.client.list_aggs(ticker=ticker,
                                       multiplier=multiplier,
                                       timespan=timespan,
                                       from_=start_date_str,
                                       to=end_date_str,
                                       limit=5000):
            aggs.append(a)
            # Throttle for free accounts by pausing 12 seconds.
            # NOTE(review): the original note said "5 API calls per second",
            # but a 12 s pause corresponds to 5 calls per minute -- confirm
            # against the account's actual rate limit.
            # NOTE(review): this sleep runs once per item yielded by
            # list_aggs, which is not obviously once per HTTP request --
            # confirm that this is the intended placement.
            time.sleep(12)
開始抓取S&P500以及DOW30的成分股,一小時框,近五年內的資料。
python download.py --sp500 --dow30 --timespan hour --multiplier 1 --start 2019-09-21 --end 2024-09-21
在漫長的下載後,成功下載了所有股票,結果後來發現DOW30的成分股全都是S&P500的成分股,總共抓了503支股票,不過看過下載完的1hour股價後,發現還要解決的問題其實不少。
從下面的log可以看到,這幾支股票,一樣的抓取條件,但抓到的筆數相差卻非常大
Data for MMM fetched successfully. 14954 rows.
Data for AOS fetched successfully. 10458 rows.
Data for ABT fetched successfully. 13442 rows.
Data for ABBV fetched successfully. 15239 rows.
0 2019-09-23 08:00:00+00:00  139.1472  139.1472  139.1472  139.1472     119.600    MMM
0 2019-09-23 13:00:00+00:00  47.000  47.48  46.860  47.425   55140.0    AOS
0 2019-09-23 12:00:00+00:00  83.5800  83.580  83.5800  83.5800     731.0    ABT
0 2019-09-23 11:00:00+00:00  72.5000  73.0500  72.500  73.0500      640.0   ABBV
原因目前認為應該是,盤前盤後的交易造成的,每支股票不同。這樣的資料如果要用來訓練跟測試,至少需要把數據對齊才有辦法使用。
以下是部分log
python download.py --sp500 --dow30 --timespan hour --multiplier 1 --start 2019-09-21 --end 2024-09-21
Fetching data for MMM (1/533)...
Data saved to ./datasets\PolygonIODownloader\MMM_2019-09-21_2024-09-21_1_hour_raw.csv
Data for MMM fetched successfully. 14954 rows.
前10筆資料:
                  timestamp      open      high       low     close      volume ticker
0 2019-09-23 08:00:00+00:00  139.1472  139.1472  139.1472  139.1472     119.600    MMM
1 2019-09-23 12:00:00+00:00  139.7074  139.7074  139.4314  139.4314     241.592    MMM
2 2019-09-23 13:00:00+00:00  138.5033  139.0301  137.5418  138.8880  238668.976    MMM
3 2019-09-23 14:00:00+00:00  138.8462  138.8462  137.8094  138.1773  227292.624    MMM
4 2019-09-23 15:00:00+00:00  138.1886  138.8712  138.1689  138.6622  209298.804    MMM
5 2019-09-23 16:00:00+00:00  138.6455  139.6990  138.3870  139.3311  334236.552    MMM
6 2019-09-23 17:00:00+00:00  139.3729  139.8401  139.1973  139.5987  165931.844    MMM
7 2019-09-23 18:00:00+00:00  139.6237  139.8161  139.5151  139.7324  188353.256    MMM
8 2019-09-23 19:00:00+00:00  139.7324  139.9916  139.4398  139.4482  490236.812    MMM
9 2019-09-23 20:00:00+00:00  139.4314  139.4314  139.4314  139.4314   21472.984    MMM
最後10筆資料:
                      timestamp     open      high      low    close     volume ticker
14944 2024-09-20 12:00:00+00:00  133.800  133.8000  133.750  133.750      919.0    MMM
14945 2024-09-20 13:00:00+00:00  133.780  133.7800  132.771  133.215   858654.0    MMM
14946 2024-09-20 14:00:00+00:00  133.205  133.8500  133.020  133.810   414482.0    MMM
14947 2024-09-20 15:00:00+00:00  133.790  134.5142  133.760  134.280   279221.0    MMM
14948 2024-09-20 16:00:00+00:00  134.295  134.7600  134.210  134.445   241361.0    MMM
14949 2024-09-20 17:00:00+00:00  134.460  134.9500  134.290  134.710   442866.0    MMM
14950 2024-09-20 18:00:00+00:00  134.700  134.7800  134.105  134.595   388438.0    MMM
14951 2024-09-20 19:00:00+00:00  134.580  134.8000  134.000  134.760  1187421.0    MMM
14952 2024-09-20 20:00:00+00:00  134.770  135.3300  133.720  134.770   148209.0    MMM
14953 2024-09-20 22:00:00+00:00  134.770  134.7700  134.770  134.770    10357.0    MMM
Fetching data for AOS (2/533)...
Data saved to ./datasets\PolygonIODownloader\AOS_2019-09-21_2024-09-21_1_hour_raw.csv
Data for AOS fetched successfully. 10458 rows.
前10筆資料:
                  timestamp    open   high     low   close    volume ticker
0 2019-09-23 13:00:00+00:00  47.000  47.48  46.860  47.425   55140.0    AOS
1 2019-09-23 14:00:00+00:00  47.420  47.42  47.130  47.310   74617.0    AOS
2 2019-09-23 15:00:00+00:00  47.310  47.70  47.240  47.700  104694.0    AOS
3 2019-09-23 16:00:00+00:00  47.710  47.86  47.660  47.770   76278.0    AOS
4 2019-09-23 17:00:00+00:00  47.790  47.87  47.680  47.860   54102.0    AOS
5 2019-09-23 18:00:00+00:00  47.860  47.88  47.710  47.740   59210.0    AOS
6 2019-09-23 19:00:00+00:00  47.735  47.96  47.735  47.740  203440.0    AOS
7 2019-09-23 20:00:00+00:00  47.770  47.77  47.770  47.770   32096.0    AOS
8 2019-09-24 13:00:00+00:00  47.950  48.24  47.650  47.720   72433.0    AOS
9 2019-09-24 14:00:00+00:00  47.715  47.74  47.300  47.680   86794.0    AOS
最後10筆資料:
                      timestamp    open     high      low   close    volume ticker
10448 2024-09-19 20:00:00+00:00  84.290  84.2900  84.2900  84.290    7204.0    AOS
10449 2024-09-20 10:00:00+00:00  84.200  84.2400  84.2000  84.240     422.0    AOS
10450 2024-09-20 13:00:00+00:00  84.020  84.1900  82.8200  82.875  127059.0    AOS
10451 2024-09-20 14:00:00+00:00  82.875  83.3800  82.6200  83.130   67117.0    AOS
10452 2024-09-20 15:00:00+00:00  83.070  83.2500  82.8650  83.010   54899.0    AOS
10453 2024-09-20 16:00:00+00:00  83.010  83.1600  82.9026  83.110   40683.0    AOS
10454 2024-09-20 17:00:00+00:00  83.140  83.4150  83.0300  83.110   37185.0    AOS
10455 2024-09-20 18:00:00+00:00  83.070  83.1200  82.7950  83.070   68247.0    AOS
10456 2024-09-20 19:00:00+00:00  83.070  83.2900  82.7200  82.900  282742.0    AOS
10457 2024-09-20 20:00:00+00:00  82.890  82.8916  82.8900  82.890   46638.0    AOS
Fetching data for ABT (3/533)...
Data saved to ./datasets\PolygonIODownloader\ABT_2019-09-21_2024-09-21_1_hour_raw.csv
Data for ABT fetched successfully. 13442 rows.
前10筆資料:
                  timestamp     open    high      low    close    volume ticker
0 2019-09-23 12:00:00+00:00  83.5800  83.580  83.5800  83.5800     731.0    ABT
1 2019-09-23 13:00:00+00:00  83.6000  83.630  83.1900  83.4200  199670.0    ABT
2 2019-09-23 14:00:00+00:00  83.4300  83.590  83.2700  83.4800  198560.0    ABT
3 2019-09-23 15:00:00+00:00  83.4800  83.720  83.4200  83.6496  210529.0    ABT
4 2019-09-23 16:00:00+00:00  83.6323  83.664  83.4500  83.4800  126953.0    ABT
5 2019-09-23 17:00:00+00:00  83.4900  83.490  83.1300  83.1600  168024.0    ABT
6 2019-09-23 18:00:00+00:00  83.1732  83.310  83.1732  83.2600  156499.0    ABT
7 2019-09-23 19:00:00+00:00  83.2500  83.430  83.1600  83.1700  465495.0    ABT
8 2019-09-23 20:00:00+00:00  83.1600  83.160  83.1600  83.1600  158481.0    ABT
9 2019-09-23 21:00:00+00:00  83.2000  83.200  83.1600  83.1600     714.0    ABT
最後10筆資料:
                      timestamp     open      high       low     close     volume ticker
13432 2024-09-20 12:00:00+00:00  114.360  114.3600  114.3600  114.3600      152.0    ABT
13433 2024-09-20 13:00:00+00:00  113.620  114.1300  113.2000  113.8700  1383365.0    ABT
13434 2024-09-20 14:00:00+00:00  113.950  114.1000  113.5550  113.8350   575450.0    ABT
13435 2024-09-20 15:00:00+00:00  113.825  114.1200  113.6950  113.7000   416296.0    ABT
13436 2024-09-20 16:00:00+00:00  113.690  113.8000  113.5500  113.6700   236489.0    ABT
13437 2024-09-20 17:00:00+00:00  113.680  114.1600  113.6750  114.1110   324759.0    ABT
13438 2024-09-20 18:00:00+00:00  114.120  114.1500  113.5650  113.7150   308884.0    ABT
13439 2024-09-20 19:00:00+00:00  113.710  113.9100  113.4420  113.6900  1218445.0    ABT
13440 2024-09-20 20:00:00+00:00  113.700  113.7016  113.6300  113.6300   130750.0    ABT
13441 2024-09-20 22:00:00+00:00  113.700  113.7000  113.3319  113.3319      388.0    ABT
Fetching data for ABBV (4/533)...
Data saved to ./datasets\PolygonIODownloader\ABBV_2019-09-21_2024-09-21_1_hour_raw.csv
Data for ABBV fetched successfully. 15239 rows.
前10筆資料:
                  timestamp     open     high     low    close     volume ticker
0 2019-09-23 11:00:00+00:00  72.5000  73.0500  72.500  73.0500      640.0   ABBV
1 2019-09-23 12:00:00+00:00  73.3500  73.3500  72.390  72.8600     3817.0   ABBV
2 2019-09-23 13:00:00+00:00  72.9000  72.9611  72.050  72.7817   717311.0   ABBV
3 2019-09-23 14:00:00+00:00  72.7600  73.3000  72.760  73.0850   859381.0   ABBV
4 2019-09-23 15:00:00+00:00  73.0900  73.5400  73.010  73.4100   880159.0   ABBV
5 2019-09-23 16:00:00+00:00  73.4174  73.6800  73.360  73.5800   801864.0   ABBV
6 2019-09-23 17:00:00+00:00  73.5800  73.7800  73.570  73.7500   702725.0   ABBV
7 2019-09-23 18:00:00+00:00  73.7490  73.7600  73.325  73.3300  1091298.0   ABBV
8 2019-09-23 19:00:00+00:00  73.3300  73.3900  72.910  72.9700  1556149.0   ABBV
9 2019-09-23 20:00:00+00:00  72.9700  73.2000  72.910  73.1000   249423.0   ABBV
最後10筆資料:
                      timestamp     open      high     low     close     volume ticker
15229 2024-09-20 13:00:00+00:00  193.000  193.5000  192.13  192.9550  1307749.0   ABBV
15230 2024-09-20 14:00:00+00:00  193.130  193.8210  192.89  193.1300   597094.0   ABBV
15231 2024-09-20 15:00:00+00:00  193.100  193.2600  192.58  192.6550   317190.0   ABBV
15232 2024-09-20 16:00:00+00:00  192.640  193.0100  192.41  192.9801   196176.0   ABBV
15233 2024-09-20 17:00:00+00:00  193.005  193.6100  192.99  193.4150   206329.0   ABBV
15234 2024-09-20 18:00:00+00:00  193.410  193.4400  192.56  192.8300   778455.0   ABBV
15235 2024-09-20 19:00:00+00:00  192.830  193.5550  192.60  193.4600   879074.0   ABBV
15236 2024-09-20 20:00:00+00:00  193.470  193.4716  193.00  193.4700   455551.0   ABBV
15237 2024-09-20 21:00:00+00:00  193.280  193.2900  193.28  193.2900      510.0   ABBV
15238 2024-09-20 22:00:00+00:00  193.470  193.4700  193.47  193.4700     1311.0   ABBV
像是'BRK.B'從2019-09-30以後才開始有資料,不知道為啥,應該不可能是那時候才上市,可能有其他原因。
                  timestamp     open    high     low   close    volume ticker
0 2019-09-30 09:00:00+00:00  207.750  207.75  207.75  207.75     100.0  BRK.B
1 2019-09-30 11:00:00+00:00  207.550  207.69  207.55  207.63    5824.0  BRK.B
2 2019-09-30 12:00:00+00:00  207.600  207.60  207.60  207.60     707.0  BRK.B
3 2019-09-30 13:00:00+00:00  207.800  208.30  207.77  207.95  216762.0  BRK.B
4 2019-09-30 14:00:00+00:00  207.945  208.00  207.55  207.96  221958.0  BRK.B
所以接下來我需要花時間研究一下,如何處理、對齊這些資料,並把這些資料切分成train和trade兩個部分,之後進行訓練。